In [5]:
%matplotlib inline
import os
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [6]:
# Load the 1-minute dataset and discard exact duplicate rows in one step
df = pd.read_csv('dataset-1min.csv').drop_duplicates()
# Preview the first three rows
df.head(3)
Out[6]:
In [7]:
# Report how many rows (instances) and columns (features) the frame has
n_instances, n_features = df.shape
print("{} instances with {} features\n".format(n_instances, n_features))
# Count the rows that fall into each occupancy class
class_counts = df.groupby('occupancy_category')['occupancy_category'].count()
print(class_counts)
In [8]:
# Helper that turns a row's occupancy_category label into a string code
def occupancy(df):
    """
    Return the string code for one row's ``occupancy_category``.

    ``df`` is a single row (anything indexable by ``'occupancy_category'``).
    'very-low' -> '1', 'low' -> '2', 'fair' -> '3'; every other value,
    including unrecognised ones, falls through to '4'.
    """
    category = df['occupancy_category']
    known_codes = (
        ('very-low', '1'),
        ('low', '2'),
        ('fair', '3'),
    )
    for label, code in known_codes:
        if category == label:
            return code
    # Catch-all: anything that is not one of the three labels above
    return '4'
# Encode every row with occupancy() and store the result as a new column
occupancy_codes = df.apply(occupancy, axis='columns')
df['occupancy_code'] = occupancy_codes
df.head(3)
Out[8]:
In [9]:
# Columns kept for modelling: six sensor readings followed by the encoded
# target. The target must stay last because a later cell separates it from
# the features by position.
features = ['temperature', 'humidity', 'co2', 'light', 'noise',
            'bluetooth_devices', 'occupancy_code']
# Human-readable class labels, listed in the same order as the codes
# assigned by occupancy() ('1' through '4', '4' being its catch-all).
classes = ['very-low', 'low', 'fair', 'high']
# Keep only the columns named in `features`, in that order, then show the
# resulting (rows, columns) shape. Note that `df` is rebound here, so the
# columns that were dropped are no longer reachable through this name.
df = df[features]
df.shape
Out[9]:
In [10]:
# Separate the feature matrix (every column except the last) from the target
# vector (the last column, `occupancy_code`).
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# pandas 1.0; the slices here are purely positional, so the result is the same.
data = df.iloc[:, :-1]
target = df.iloc[:, -1]
print(data.shape)
print(target.shape)
In [11]:
# Split into test and train data
from sklearn.model_selection import train_test_split
# A fixed seed keeps the split, and every score computed from it, identical
# across kernel restarts. The value 42 is arbitrary; any constant works.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
In [12]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted transform to both the training and the test split.
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler().fit(X_train)
X_train = standard_scaler.transform(X_train)
X_test = standard_scaler.transform(X_test)
In [13]:
from sklearn import metrics
from sklearn.cross_validation import KFold
from yellowbrick.classifier import ClassificationReport, ROCAUC, ClassBalance
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
In [14]:
def fit_and_evaluate(dataset, model, label, **kwargs):
    """
    Cross-validate ``model`` on ``dataset`` and print the averaged scores.

    Because of the Scikit-Learn API, one function can do all of the fit and
    evaluate work for any estimator class.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns followed by the target in the LAST column (the same
        layout the notebook uses when it slices ``data`` / ``target``).
    model : class
        Estimator class (not an instance); a fresh ``model(**kwargs)`` is
        built for every fold.
    label : str
        Name used in the printed report.
    **kwargs
        Forwarded unchanged to the ``model`` constructor.

    Returns
    -------
    None. The timing line and the mean precision / recall / accuracy / f1
    over the folds are printed.
    """
    # Imported locally so this function always uses the current splitter API
    # (``n_splits`` + ``.split``) regardless of which ``KFold`` name the
    # notebook imported at module level.
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import StandardScaler

    start = time.time()  # Start the clock!
    scores = {'precision': [], 'recall': [], 'accuracy': [], 'f1': []}

    # Use the frame that was passed in instead of notebook globals.
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    # Seeded shuffle so repeated runs produce the same folds.
    folds = KFold(n_splits=12, shuffle=True, random_state=42)
    for train, test in folds.split(X):
        # Actually use the fold indices: each pass trains and scores on a
        # different partition. The scaler is fitted on the training part only
        # so no statistics from the held-out part leak into training.
        scaler = StandardScaler()
        X_fold_train = scaler.fit_transform(X[train])
        X_fold_test = scaler.transform(X[test])

        estimator = model(**kwargs)
        estimator.fit(X_fold_train, y[train])

        expected = y[test]
        predicted = estimator.predict(X_fold_test)

        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time()-start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())
In [15]:
# Perform SVC Classification
# `svc` is a default instance kept for the report cell that follows;
# fit_and_evaluate receives the class and builds its own estimators.
svc = SVC()
fit_and_evaluate(dataset=df, model=SVC, label="SVM Classifier")
In [16]:
# Build a yellowbrick ClassificationReport around `svc`, fit it on the
# training split, score it on the held-out split, then call poof() on it.
# NOTE(review): `svc` is the default-constructed SVC from the preceding cell,
# not one of the estimators fit_and_evaluate created internally.
visualizer = ClassificationReport(svc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
In [28]:
# Perform kNN Classification
# Build the instance used by the report cell with the same n_neighbors as the
# cross-validated run, so the scores and the report describe the same model.
knn = KNeighborsClassifier(n_neighbors=12)
fit_and_evaluate(df, KNeighborsClassifier, "kNN Classifier", n_neighbors=12)
In [29]:
# Build a yellowbrick ClassificationReport around `knn` (the instance created
# in the preceding cell), fit it on the training split, score it on the
# held-out split, then call poof() on it.
visualizer = ClassificationReport(knn, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
In [30]:
# Perform Random Forest Classification
# `rfc` is a default instance kept for the report cell that follows;
# fit_and_evaluate receives the class and builds its own estimators.
rfc = RandomForestClassifier()
fit_and_evaluate(dataset=df, model=RandomForestClassifier, label="Random Forest Classifier")
In [31]:
# Build a yellowbrick ClassificationReport around `rfc` (the instance created
# in the preceding cell), fit it on the training split, score it on the
# held-out split, then call poof() on it.
visualizer = ClassificationReport(rfc, classes=classes)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()
In [ ]: